#———————————————————————————————————————————– #IMPORTING NECESSARY PACKAGES.
library(data.table) # used for reading and manipulation of data
library(dplyr) # used for data manipulation and joining
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # used for ploting
library(caret) # used for modeling
## Loading required package: lattice
library(corrplot) # used for making correlation plot
## corrplot 0.84 loaded
library(xgboost) # used for building XGBoost model
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(cowplot) # used for combining multiple plots
library(caretEnsemble)
##
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
##
## autoplot
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
#———————————————————————————————————————————–
#CODE TO BE FILLED
#Transforming "low fat" and "LF" to "Low Fat"
train_v9rqX0R_1_ <- read.csv("train_v9rqX0R (1).csv")
big_mart <- train_v9rqX0R_1_
index <- which(big_mart$Item_Fat_Content == "LF" |
big_mart$Item_Fat_Content == "low fat")
big_mart[index, "Item_Fat_Content"] <- "Low Fat"
#Transforming "reg" to "Regular
index2 <- which(big_mart$Item_Fat_Content == "reg")
big_mart[index2, "Item_Fat_Content"] <- "Regular"
#Dropping Unused Levels
#Using kNN imputation for missing values
big_mart_imputed <- kNN(big_mart)
big_mart_imputed <- big_mart_imputed %>%
select(Item_Identifier:Item_Outlet_Sales)
#write.csv()
summary(big_mart_imputed)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## Length:8523 Min. : 4.555 Length:8523 Min. :0.00000
## Class :character 1st Qu.: 9.000 Class :character 1st Qu.:0.02699
## Mode :character Median :12.600 Mode :character Median :0.05393
## Mean :12.838 Mean :0.06613
## 3rd Qu.:16.500 3rd Qu.:0.09459
## Max. :21.350 Max. :0.32839
## Item_Type Item_MRP Outlet_Identifier
## Length:8523 Min. : 31.29 Length:8523
## Class :character 1st Qu.: 93.83 Class :character
## Mode :character Median :143.01 Mode :character
## Mean :140.99
## 3rd Qu.:185.64
## Max. :266.89
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 Length:8523 Length:8523
## 1st Qu.:1987 Class :character Class :character
## Median :1999 Mode :character Mode :character
## Mean :1998
## 3rd Qu.:2004
## Max. :2009
## Outlet_Type Item_Outlet_Sales
## Length:8523 Min. : 33.29
## Class :character 1st Qu.: 834.25
## Mode :character Median : 1794.33
## Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
table(big_mart_imputed$Outlet_Identifier, big_mart_imputed$Outlet_Size)
##
## High Medium Small
## OUT010 555 0 0 0
## OUT013 0 932 0 0
## OUT017 926 0 0 0
## OUT018 0 0 928 0
## OUT019 0 0 0 528
## OUT027 0 0 935 0
## OUT035 0 0 0 930
## OUT045 929 0 0 0
## OUT046 0 0 0 930
## OUT049 0 0 930 0
table(big_mart_imputed$Outlet_Identifier, big_mart_imputed$Outlet_Type)
##
## Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3
## OUT010 555 0 0 0
## OUT013 0 932 0 0
## OUT017 0 926 0 0
## OUT018 0 0 928 0
## OUT019 528 0 0 0
## OUT027 0 0 0 935
## OUT035 0 930 0 0
## OUT045 0 929 0 0
## OUT046 0 930 0 0
## OUT049 0 930 0 0
table(big_mart$Outlet_Type, big_mart_imputed$Outlet_Size)
##
## High Medium Small
## Grocery Store 555 0 0 528
## Supermarket Type1 1855 932 930 1860
## Supermarket Type2 0 0 928 0
## Supermarket Type3 0 0 935 0
index3 <- which(big_mart_imputed$Outlet_Identifier == "OUT010")
big_mart_imputed[index3, "Outlet_Size"] <- "Small"
index4 <- which(big_mart_imputed$Outlet_Identifier == "OUT017")
big_mart_imputed[index4, "Outlet_Size"] <- "Small"
index5 <- which(big_mart_imputed$Outlet_Identifier == "OUT045")
big_mart_imputed[index5, "Outlet_Size"] <- "Small"
big_mart_imputed$Outlet_Size <- factor(big_mart_imputed$Outlet_Size)
summary(big_mart_imputed)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## Length:8523 Min. : 4.555 Length:8523 Min. :0.00000
## Class :character 1st Qu.: 9.000 Class :character 1st Qu.:0.02699
## Mode :character Median :12.600 Mode :character Median :0.05393
## Mean :12.838 Mean :0.06613
## 3rd Qu.:16.500 3rd Qu.:0.09459
## Max. :21.350 Max. :0.32839
## Item_Type Item_MRP Outlet_Identifier
## Length:8523 Min. : 31.29 Length:8523
## Class :character 1st Qu.: 93.83 Class :character
## Mode :character Median :143.01 Mode :character
## Mean :140.99
## 3rd Qu.:185.64
## Max. :266.89
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 High : 932 Length:8523
## 1st Qu.:1987 Medium:2793 Class :character
## Median :1999 Small :4798 Mode :character
## Mean :1998
## 3rd Qu.:2004
## Max. :2009
## Outlet_Type Item_Outlet_Sales
## Length:8523 Min. : 33.29
## Class :character 1st Qu.: 834.25
## Mode :character Median : 1794.33
## Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.97
#write.csv(big_mart_imputed,"C:/Users/jayanth/Desktop/Data analytics lab/sales_cleaned.csv", row.names = FALSE)
sales_cleaned <- read.csv("sales_cleaned.csv")
#———————————————————————————————————————————–
#———————————————————————————————————————————–
#1. What are the types of outlets, outlet locations, and outlet sizes in the given dataset and what is their count?
table(sales_cleaned$Outlet_Location_Type)
##
## Tier 1 Tier 2 Tier 3
## 2388 2785 3350
table(sales_cleaned$Outlet_Type)
##
## Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3
## 1083 5577 928 935
table(sales_cleaned$Outlet_Size)
##
## High Medium Small
## 932 3722 3869
#Q2. WHAT IS THE RELATIONSHIP BETWEEN OUTLET TYPE AND OUTLET SIZE?
table(sales_cleaned$Outlet_Type , sales_cleaned$Outlet_Size)
##
## High Medium Small
## Grocery Store 0 0 1083
## Supermarket Type1 932 1859 2786
## Supermarket Type2 0 928 0
## Supermarket Type3 0 935 0
#Q3.COUNT OF DIFFERENT ITEM TYPES IN THE VARIOUS OUTLET TYPES.
table(sales_cleaned$Item_Type , sales_cleaned$Outlet_Type)
##
## Grocery Store Supermarket Type1 Supermarket Type2
## Baking Goods 85 426 68
## Breads 33 160 27
## Breakfast 19 68 12
## Canned 73 426 78
## Dairy 92 450 73
## Frozen Foods 103 572 92
## Fruits and Vegetables 152 805 135
## Hard Drinks 24 145 22
## Health and Hygiene 67 335 58
## Household 119 597 95
## Meat 66 257 46
## Others 27 107 20
## Seafood 10 40 7
## Snack Foods 146 785 132
## Soft Drinks 54 300 46
## Starchy Foods 13 104 17
##
## Supermarket Type3
## Baking Goods 69
## Breads 31
## Breakfast 11
## Canned 72
## Dairy 67
## Frozen Foods 89
## Fruits and Vegetables 140
## Hard Drinks 23
## Health and Hygiene 60
## Household 99
## Meat 56
## Others 15
## Seafood 7
## Snack Foods 137
## Soft Drinks 45
## Starchy Foods 14
data1 <- sales_cleaned
#Q4. THE COUNT OF THE OUTLET SIZES IN EACH OUTLET LOCATION.
table(data1$Outlet_Location_Type , data1$Outlet_Size)
##
## High Medium Small
## Tier 1 0 930 1458
## Tier 2 0 929 1856
## Tier 3 932 1863 555
#Q5. ITEM TYPE AND IT’S AVERAGE MRP
df3 <- data1 %>% group_by(Item_Type ) %>% summarise(mean_Item_MRP = mean(Item_MRP, na.rm=TRUE))
df3
#Q6. AVERAGE OUTLET SALES OF EACH OUTLET LOCATION TYPE.
df4 <- data1 %>% group_by(Outlet_Location_Type ) %>% summarise(mean_outlet_sales = mean(Item_Outlet_Sales, na.rm=TRUE)) %>% arrange(desc(mean_outlet_sales))
df4
#———————————————————————————————————————————-
#Q7. WHAT IS THE RELATION BETWEEN THE SALES AMOUNT AND THE TYPE OF OUTLET?
sales_cleaned %>% group_by(Outlet_Type) %>% summarise(Mean_sales_per_outlet= mean(Item_Outlet_Sales)) %>% arrange(desc(Mean_sales_per_outlet))
#Q8. HOW ARE OUTLET LOCATION AND OUTLET TYPE RELATED?
#sales_cleaned %>% select(Outlet_Location_Type,Outlet_Type) %>% group_by(Outlet_Location_Type) %>% summarise()
table(sales_cleaned$Outlet_Location_Type, sales_cleaned$Outlet_Type)
##
## Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3
## Tier 1 528 1860 0 0
## Tier 2 0 2785 0 0
## Tier 3 555 932 928 935
#Q9. RELATIONSHIP BETWEEN YEAR OF ESTABLSIHMENT OF AN OUTLET AND THE OUTLET TYPE.
table(sales_cleaned$Outlet_Establishment_Year, sales_cleaned$Outlet_Type)
##
## Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3
## 1985 528 0 0 935
## 1987 0 932 0 0
## 1997 0 930 0 0
## 1998 555 0 0 0
## 1999 0 930 0 0
## 2002 0 929 0 0
## 2004 0 930 0 0
## 2007 0 926 0 0
## 2009 0 0 928 0
#Q10. RELATIONSHIP BETWEEN THE MRP AND THE ITEM VISIBILTIY.
res <- cor.test(sales_cleaned$Item_MRP, sales_cleaned$Item_Visibility,
method = "pearson")
res
##
## Pearson's product-moment correlation
##
## data: sales_cleaned$Item_MRP and sales_cleaned$Item_Visibility
## t = -0.12137, df = 8521, p-value = 0.9034
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02254486 0.01991635
## sample estimates:
## cor
## -0.001314848
#Q11. RELATIONSHIP BETWEEN THE ITEM TYPE AND VISIBILITY.
sales_cleaned %>% select(Item_Visibility, Item_Type) %>% group_by(Item_Type) %>% summarise(avg_visibility= mean(Item_Visibility)) %>% arrange(desc(avg_visibility))
#Q12. RELATIONSHIP BETWEEN SALES AND OUTLET SIZE?
sales_cleaned %>% group_by(Outlet_Size) %>% summarise(Mean_sales_per_outlet= mean(Item_Outlet_Sales)) %>% arrange(desc(Mean_sales_per_outlet))
#Q13. AVERAGE SALES BY OUTLET SIZE.
df6 <- data1 %>% group_by(Outlet_Size ) %>% summarise(mean_outlet_sales = mean(Item_Outlet_Sales, na.rm=TRUE)) %>% arrange(desc(mean_outlet_sales))
df6
#———————————————————————————————————————————-
#1. COUNT OF RETAIL CHAINS ESTABLISHED EVERY YEAR.
df1 <- sales_cleaned
x <- ggplot(data=sales_cleaned,aes(x=Outlet_Establishment_Year)) + geom_histogram(binwidth = 01, color='red',fill='blue') + xlab('YEAR OF ESTABLISHMENT') + ylab('Frequency') + ggtitle('HISTOGRAM PLOT OF YEAR OF ESTABLISHMENT')
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:xgboost':
##
## slice
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
ggplotly(x)
p0 <- ggplot(sales_cleaned) + geom_histogram(aes(sales_cleaned$Item_Outlet_Sales), binwidth = 100, fill = "darkgreen") +
xlab("Item_Outlet_Sales")
p0
## Warning: Use of `sales_cleaned$Item_Outlet_Sales` is discouraged. Use
## `Item_Outlet_Sales` instead.
library(cowplot)
combi=sales_cleaned
p1 = ggplot(combi) + geom_histogram(aes(Item_Weight), binwidth = 0.5, fill = "blue")
p2 = ggplot(combi) + geom_histogram(aes(Item_Visibility), binwidth = 0.005, fill = "blue")
p3 = ggplot(combi) + geom_histogram(aes(Item_MRP), binwidth = 1, fill = "blue")
plot_grid(p1, p2, p3, nrow = 1) # plot_grid() from cowplot package
#Independent Variables(categorical variables)
ggplot(combi %>% group_by(Item_Fat_Content) %>% summarise(Count = n())) +
geom_bar(aes(Item_Fat_Content, Count), stat = "identity", fill = "coral1")
# plot for Item_Type
p4 = ggplot(combi %>% group_by(Item_Type) %>% summarise(Count = n())) +
geom_bar(aes(Item_Type, Count), stat = "identity", fill = "coral1") +
xlab("") +
geom_label(aes(Item_Type, Count, label = Count), vjust = 0.5) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ggtitle("Item_Type")
ggplotly(p4)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabel() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
p5 = ggplot(combi %>% group_by(Outlet_Identifier) %>% summarise(Count = n())) +
geom_bar(aes(Outlet_Identifier, Count), stat = "identity", fill = "coral1") +
geom_label(aes(Outlet_Identifier, Count, label = Count), vjust = 0.5) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(p5)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabel() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
p6 = ggplot(combi %>% group_by(Outlet_Size) %>% summarise(Count = n())) +
geom_bar(aes(Outlet_Size, Count), stat = "identity", fill = "coral1") +
geom_label(aes(Outlet_Size, Count, label = Count), vjust = 0.5) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
second_row = plot_grid(p5, p6, nrow = 1)
plot_grid(p4, second_row, ncol = 1)
ggplotly(p6)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabel() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
library(ggplot2)
p8 = ggplot(combi %>% group_by(Outlet_Type) %>% summarise(Count = n())) +
geom_bar(aes(Outlet_Type, Count), stat = "identity", fill = "coral1") +
geom_label(aes(factor(Outlet_Type), Count, label = Count), vjust = 0.5) +
theme(axis.text.x = element_text(size = 8.5))
library(plotly)
ggplotly(p8)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabel() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
# Item_Visibility vs Item_Outlet_Sales
p10 = ggplot(sales_cleaned) +
geom_point(aes(Item_Visibility, Item_Outlet_Sales), colour = "violet", alpha = 0.3) +
theme(axis.title = element_text(size = 8.5))
p10
#item type vs outlet sales
#item type vs outlet sales
qplot(x = sqrt(Item_Outlet_Sales), data = data1, binwidth = 1,
ylab = "Sales Distribution",
xlab = "SQRT of Outlet Sales",
fill=I("tomato")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
theme_minimal() +
scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
facet_wrap(~Item_Type)
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 32 rows containing missing values (geom_bar).
qplot(x = Item_Type, y = sqrt(Item_Outlet_Sales),
ylab = "SQRT of Outlet Sales",
data = data1,
geom = "boxplot",
fill=I("tomato")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30))
#outlet identifier vs outlet sales
qplot(x = sqrt(Item_Outlet_Sales), data = data1, binwidth = 1,
ylab = "Sales Distribution",
xlab = "SQRT of Outlet Sales",
fill=I("tomato")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
theme_minimal() +
scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
facet_wrap(~Outlet_Identifier,2)
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 23 rows containing missing values (geom_bar).
qplot(x = Outlet_Identifier, y = sqrt(Item_Outlet_Sales),
color = Outlet_Type,
ylab = "SQRT of Outlet Sales",
data = data1,
geom = "boxplot",
fill=Outlet_Type) +
geom_vline(aes(xintercept=quantile(data1$Item_Outlet_Sales, c(.01))), color="red", linetype="dashed", size=1) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30))
## Warning: Use of `data1$Item_Outlet_Sales` is discouraged. Use
## `Item_Outlet_Sales` instead.
#outlet sales vs fat content
qplot(x = sqrt(Item_Outlet_Sales), data = data1, binwidth = 1,
ylab = "",
xlab = "SQRT of Outlet Sales",
fill=I("tomato")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
scale_y_continuous(limits = c(0,200), breaks = seq(0,200,100)) +
theme_minimal() +
facet_wrap(~Item_Fat_Content)
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).
qplot(x = Item_Fat_Content, y = sqrt(Item_Outlet_Sales),
ylab = "SQRT of Outlet Sales",
data = data1,
geom = "boxplot",
fill=I("tomato")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0),
axis.text.y = element_text(angle = 30))
#outlet size vs outlet sales
qplot(x = sqrt(Item_Outlet_Sales), data = sales_cleaned, binwidth = 0.5,
ylab = "Sales Distribution",
xlab = "SQRT of Outlet Sales",
fill=I("tomato")) +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30)) +
theme_minimal() +
scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
facet_wrap(~Outlet_Size)
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 41 rows containing missing values (geom_bar).
qplot(x = Outlet_Size, y = sqrt(Item_Outlet_Sales),
ylab = "SQRT of Outlet Sales",
data = data1,
geom = "boxplot",
fill=I("tomato")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90),
axis.text.y = element_text(angle = 30))
getwd()
## [1] "/Users/roviros/Desktop/EDA THEORY"